github_dir <- "C:/Users/Karol/Desktop/Code for Github"

read_count_dir <- paste0(github_dir, "/Read_counts")

setwd(github_dir)


# Global R markdown code chunk options
knitr::opts_chunk$set(message=FALSE, warning = FALSE, error=FALSE, echo=TRUE, cache=TRUE, 
                      fig.width = 7, fig.height = 6, fig.align = 'left')

# R packages
library(sva)
library(RUVnormalize)
library(pheatmap)
library(edgeR)
library(GenomicFeatures)
library(mclust)
library(parallel)
library(ggplot2)
library(reshape2)
library(dplyr)
library(data.table)
library(DT)
library(RColorBrewer)
library(knitr)
library(ggrepel)
library(gridExtra)
library(grid)
library(plyr)
library(plotly)
library(Hmisc)

1. Data pre-processing

setwd(github_dir)

# Generate mm9 list of exon sizes 
# Method description at: https://www.biostars.org/p/83901/
txdb <- makeTxDbFromGFF("G:/Shared drives/Nord Lab - Personal Folders/Karol/Genomes/Mus_musculus/UCSC/mm9/Annotation/genes.gtf", format="gtf")
exons.list.per.gene <- exonsBy(txdb,by="gene")

# I paralelized this increasing the speed >2x on a 4-core (logical) machine. 
# IMPORTANT: Use mclapply instead of parLapply if you use a Mac.
cl <- makeCluster(detectCores())
exonic.gene.sizes <- parallel::parLapply(cl, exons.list.per.gene,function(x){sum(width(reduce(x)))})
stopCluster(cl)


# Reading and merging count tables

# These files contain P0 data
setwd(read_count_dir)

if (file.exists("Project_Nord_MERGED_HS674_mm9-50_fC0_UN.txt")==F) {
  count.1 <- read.table("Project_Nord_L3_HS674_mm9-50_fC0_UN.txt", sep="\t", header=T, as.is=T, stringsAsFactors=F)
  count.2 <- read.table("Project_Nord_L7_HS674_mm9-50_fC0_UN.txt", sep="\t", header=T, as.is=T, stringsAsFactors=F)
  count.3 <- read.table("Project_ANIZ_L6_H674P_Zdilar_mm9-50_fC0_UN.out", sep="\t", header=T, as.is=T, stringsAsFactors=F)
  count.3 <- count.3[,c(1,4,2,3,5:ncol(count.3))]
  count.final <- count.1[,1:12]
  count.final[,1] <- count.1[,1]
  for (index in 2:12) {
    count.final[,index] <- count.1[,index] + count.2[,index] + count.3[,index]
  }
  colnames(count.final) <- c("Gene", "MIA_P0_1.S44.L006", "MIA_P0_10.S53.L006", "MIA_P0_11.S54.L006", "MIA_P0_2.S45.L006", "MIA_P0_3.S46.L006", "MIA_P0_4.S47.L006", "MIA_P0_5.S48.L006","MIA_P0_6.S49.L006", "MIA_P0_7.S50.L006", "MIA_P0_8.S51.L006", "MIA_P0_9.S52.L006")
 #write.table(count.final, "Project_Nord_MERGED_HS674_mm9-50_fC0_UN.txt", sep="\t", col.names=T, row.names=F, quote=F)
}

# This is processing E12.5 and E17.5 data
# Merge count data from split row
# Renames columns in the count matrix
#setwd("G:/Shared drives/Nord Lab - Data/MIAx00/RNAseq/Results")
if (file.exists("ALL_MATRIX_RENAMED.txt")==F) {
  count.1 <- read.table("ALL_MATRIX.txt", sep="\t", header=T, as.is=T, stringsAsFactors=F)
  update.col.names <- colnames(count.1)
  update.col.names <- gsub("X", "", update.col.names)
  sample.id <- strsplit(update.col.names, "_")
  e12.5.samples <- c("1623.1",
                     "1623.2",
                     "1635.4",
                     "1635.5",
                     "1644.3",
                     "1644.4",
                     "1646.3",
                     "1646.4",
                     "1819.2",
                     "1819.4",
                     "1819.5",
                     "1858.1")
  e17.5.samples <- c("1631.1",
                     "1631.2",
                     "1641.4",
                     "1641.5",
                     "1712.1",
                     "1712.3",
                     "1823.1",
                     "1823.2",
                     "1826.3",
                     "1826.4",
                     "1842.11",
                     "1842.1")
  
  for (index in 1:length(update.col.names)) {
    if ((strsplit(update.col.names[index], "_")[[1]][1] %in% e12.5.samples) == T) {
      update.col.names[index] <- gsub("_", "\\.", paste("MIA.e12.5.", gsub("_R1_001", "", update.col.names[index]), sep=""))
    }
    if ((strsplit(update.col.names[index], "_")[[1]][1] %in% e17.5.samples) == T) {
      update.col.names[index] <- gsub("_", "\\.", paste("MIA.e17.5.", gsub("_R1_001", "", update.col.names[index]), sep=""))
    }
  }

# Quick sanity check looking at colnames before and after renaming  
#data.frame(colnames(count.1), update.col.names)  
    
  count.final <- count.1
  colnames(count.final) <- update.col.names
  count.final <- count.final[,grep("Undetermined", update.col.names, invert = T)] # Removes "Undetermined" samples
  count.final <- cbind(Gene=rownames(count.final), count.final)
  #write.table(count.final, "ALL_MATRIX_RENAMED.txt", sep="\t", col.names=T, row.names=F, quote=F)
}

# Organizes sample.info
setwd(read_count_dir)
sample.info <- read.table("MIA.RNASeq.SampleInfo.20180710.txt", sep="\t", header=T, as.is=T, stringsAsFactors=F)
sample.info[,2] <- gsub("_", ".", sample.info[,2])
sample.info[,2] <- gsub("-", ".", sample.info[,2])
sample.info[, "CountFile"] <- gsub("ALL_MATRIX.txt", "ALL_MATRIX_RENAMED.txt", sample.info[, "CountFile"])
sample.info <- sample.info[order(as.character(sample.info[,2])),]
sample.info[,"DPC"] <- ifelse(sample.info[,"DPC"]=="21", 19.5,sample.info[,"DPC"])

# Reading and merging the remaining count tables - generates exp.data object
setwd(read_count_dir)

count.filenames <- unique(sample.info[,"CountFile"])
for (index in 1:length(count.filenames)) {
  temp <- read.table(as.character(count.filenames[index]), sep="\t", header=T, as.is=T, stringsAsFactors = F)
  named.cols <- max(grep("MIA", colnames(temp)))
  temp <- temp[,1:named.cols]
  if (index == 1) {
    exp.data <- temp
    colnames(exp.data)[1] <- "Gene"
  } else {
    split.id <- colnames(temp)[1]
    exp.data <- merge(exp.data, temp, by.x="Gene", by.y=split.id)
  }
}

colnames(exp.data) <- gsub("_", ".", colnames(exp.data))
colnames(exp.data) <- gsub("-", ".", colnames(exp.data))
exp.data <- exp.data[,order(colnames(exp.data))]
rownames(exp.data) <- exp.data[,1]
exp.data <- exp.data[,2:ncol(exp.data)]

1.1 Identification of sample sex

A threshold of 1000 counts of Xist transcript was used to identify sex

# I'm checking if the 1000 count threshorld for Xist makes sense. It does. 

# Qualifies F or M based on the Xist expression
sex.by.rna <- c(ifelse(exp.data["Xist",]>1000,"F","M")) #

Xist_exp <- as.data.frame(melt(exp.data["Xist",]))
Xist_exp <- cbind(Xist_exp, sex.by.rna)
Xist_exp <- arrange(Xist_exp, value)
colnames(Xist_exp) <- c("variable", "counts", "sex.by.rna")

ggplot(Xist_exp, aes(x=sex.by.rna, y=counts, colours=sex.by.rna))+
  geom_jitter()+
  geom_boxplot(alpha=0.2)+
  theme_bw()+
  labs(title="Xist read counts")+
  theme(plot.title = element_text(size = rel(2), hjust=0.5))+
  scale_x_discrete(breaks= c("F", "M"), labels=c("Females", "Males"))

# Samples marked as outliers were removed.
# Identifies outliers
outlier <- ifelse(sample.info[,"Outlier"]=="1",1,0)         #Samples marked with 1 are Outliers 
outlier <- ifelse(is.na(sample.info[,"Outlier"]),0,outlier) #

#Preserving data with outliers
exp.data_w_outliers <- exp.data
sample.info_w_outliers <- sample.info

# Removes the outliers from the dataset
exp.data <- exp.data[,which(outlier==0)]                    #Filters exp.data columns(samples) which are not outliers
sample.info <- sample.info[which(outlier==0),]              #It does the same for the rows of sample info 

# Assigns group values 1 for Saline; 2 for PolyIC, 3 for Inhibitor
group <- ifelse(sample.info[,"Condition"]=="Saline",1,2)
group <- ifelse(sample.info[,"Condition"]=="Inhibitor",3,group) #
#group <- ifelse(sample.info[,"Condition"]=="unknown",4,group)
group <- factor(group)

# Assigns sex value, 1 for Male, 2 for Female and 3 for unknown.
# There may be a logical error in this assignment because there are 4 symbols deliminating sex "M" "F" "?" ""
# Therefore ?" "" get assigned the value of 2. 
#IMPORTANT: Check if for the DE expression sex is called from the RNA or from this column.
#ANSWER: No. Sex in the analysis is called from the Xist RNA expression. This sex calling is done just pro forma.

sex <- ifelse(sample.info[,"Sex"]=="M",1,2)            
sex <- ifelse(sample.info[,"Sex"]=="unknown",3,sex)    
sex <- factor(sex)

#Adds sex.by.rna and to sample.info. Double check if there wans't an error with assigning sex by RNA into the sample.info data.
sample.info <- data.frame(sample.info, sex_by_rna = sex.by.rna[which(outlier!=1)])
sample.info <- cbind(sample.info, ExperimentalDesign=paste(sample.info[,"DPC"], sample.info[,"Condition"], sep="_"))

# Writes updated sample.infor and exp.data to files
#write.table(sample.info, "UpdatedSampleInfo.txt", sep="\t", col.names=T, row.names=F)
#write.table(cbind(Gene=rownames(exp.data), exp.data), "MIA_GENE_MATRIX.txt", sep="\t", col.names=T, row.names=F)

# Renames columns in sample.info - housekeeping
info.col.names <- colnames(sample.info)
sample.info <- sample.info[,c(2,1,3:ncol(sample.info))]
colnames(sample.info) <- info.col.names

# And writes another file...
#write.table(sample.info, "MIA_SAMPLE_INFO.txt", sep="\t", col.names=T, row.names=F, quote=F)

# Writes a sample.info file lacking the inhibitor and lane 12 samples
#write.table(sample.info[which(group!=3 & sample.info[,"Lane"]!=12),], "MIA_SAMPLE_INFO_POLYIC.txt", sep="\t", col.names=T, row.names=F, quote=F)
#write.table(cbind(Gene=rownames(exp.data), exp.data), "MIA_GENE_MATRIX.txt", sep="\t", col.names=T, row.names=F, quote=F)

#datatable(sample.info, rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))


# Generates vectors/factors from the outlier-filtered dataset for the analysis
sex.by.rna <- ifelse(exp.data["Xist",]>1000,"2","1")
sex.by.rna <- factor(sex.by.rna)
response <- ifelse(sample.info[,"Response"]=="med",1,2)
response <- factor(response)
#unknown.batch <- ifelse(sample.info[,"ControlOutlier"]=="0",1,2)
#unknown.batch <- factor(unknown.batch)
lane <- factor(sample.info[,"Lane"])
dpc <- sample.info[,"DPC"]
dpc <- ifelse(dpc=="P0", 19.5, dpc)
dpc <- ifelse(dpc=="21", 19.5, dpc)    # Why??
dpc <- ifelse(dpc=="e14.5", 14.5, dpc)
dpc <- ifelse(dpc=="e12.5", 12.5, dpc)
dpc <- ifelse(dpc=="e17.5", 17.5, dpc)
dpc <- as.factor(dpc)

rRNA <- exp.data["Rn45s",]/colSums(exp.data) #45S rRNA serves as the precursor for the 18S, 5.8S and 28S rRNA. It's used as a normalization factor
colnames(exp.data) <- paste(dpc, group, sex.by.rna, response, lane, sample.info[,"Library"], sep="_") #Interesting encoding
exp.original <- exp.data

# exp.data colnames encoding
colnames(exp.data) <- paste(dpc, group, sex.by.rna, response, lane, sample.info[,"Library"], sep="_")


# Calculates RPKM values

# Gene lengths calculated with lapply
gene.lengths <- as.numeric(lapply(1:nrow(exp.data), function(x) FUN= as.numeric(exonic.gene.sizes[rownames(exp.data)[x]])))

# edgeR settings
rpkm.data <- rpkm(exp.data, gene.length=gene.lengths, log=T, prior.count=.25)

# Removes batch effect associated with lane of sequencing. "design: optional design matrix relating to treatment conditions to be preserved". I think this is may be incorrect since the exp.data contains raw counts.Run this on rpkm data instead. 
rpkm.batch <- removeBatchEffect(exp.data, batch=lane, design=cbind(dpc, group, sex.by.rna, response))


### Removes genes with low expression ###

# Lets plot a histogram of all rpkm values in a dataset.
# dim(rpkm.data) #24015 rows and 102 columns

df <- melt(rpkm.data)
df$rpkm_linear <- 2^df$value # This converts the log2 RPKM values back to linear scale
colnames(df) <- c("gene_name", "sample", "rpkm_log2","rpkm_linear")

# Histogram of all rpkm values in a dataset.
#ggplot(df, aes(x=rpkm_log2)) + geom_histogram(bins = 1000, color="black")+theme_bw()+labs(title="Complete dataset of 24015 genes", x="log2 RPKMsfrom all samples")

# Setting the threshold for minimum rpkm value in at least 2 samples. 
threshold = -2

# That translates to 17195 genes out of a total of 24015
#sum(melt(rowSums(rpkm.data > threshold) >= 2)$value)

# Let's filter the dataset and plot the histogram distribution
keep <- as.data.frame(melt(rowSums(rpkm.data > threshold) >= 2))
keep$gene_name <- rownames(keep)
keep <- filter(keep, value == "TRUE")$gene_name

# 
datExpr_test <- as.data.frame(rpkm.data)
datExpr_test$gene_name <- rownames(datExpr_test)

# Filters the "keep" genes 
datExpr_test <- filter(datExpr_test, gene_name %in% keep)

# Generates a matrix-type data frame
rownames(datExpr_test) <- datExpr_test$gene_name
datExpr_test$gene_name <- NULL

# Overwrites the original datExpr object.
datExpr <- datExpr_test


# Plots the histogram after filtering
df <- melt(datExpr)
df$rpkm_linear <- 2^df$value # This converts the log2 RPKM values back to linear scale
colnames(df) <- c("sample", "rpkm_log2","rpkm_linear")

#ggplot(df, aes(x=rpkm_log2)) + geom_histogram(bins = 1000, color="black")+theme_bw()+labs(title=paste("After filtering", "n of genes" , nrow(datExpr),".", "Threshold in at least 2 samples >", threshold))

# Removing low expression genes. Overwrites exp.data object used for DE analysis ### 
#dim(exp.data)  #exp.data contains counts, datExpr contains rpkms 

exp.data$gene_name <- rownames(exp.data)
exp.data <- filter(exp.data, gene_name %in% rownames(datExpr))
#dim(exp.data)
#head(exp.data)
rownames(exp.data) <- exp.data$gene_name
exp.data$gene_name <- NULL 
#dim(exp.data)
#head(exp.data)

# Recalculates rpkm values
# Gene lengths calculated with lapply
gene.lengths <- as.numeric(lapply(1:nrow(exp.data), function(x) FUN= as.numeric(exonic.gene.sizes[rownames(exp.data)[x]])))

# RPKM calculation
rpkm.data <- rpkm(exp.data, gene.length=gene.lengths, log=T, prior.count=.25)

rpkm.data_linear <- rpkm(exp.data, gene.length=gene.lengths, log=F)

1.2 Sample metadata

datatable(sample.info[,setdiff(c(1:23), c(7, 8, 14, 15, 16, 18, 21:23))], 
          rownames = FALSE, 
          filter = list(position = 'top', clear = FALSE, plain = FALSE), 
          options = list(pageLength = 10, scrollX=T, scrollY=T))

1.3 PCA plots

setwd(github_dir)
source("Dimensionality reduction plots.r")

grid.arrange(PCA_plot_without_inhibitor_and_lane_12, PCA_plot_without_inhibitor_and_lane_12_sex, PCA_plot_w_inh, ncol = 3)

2. Saline vs PolyIC developmental time prediction linear model

control.datapoints <- intersect(which(group=="1"),  which(lane != "12"))
control.datapoints <- intersect(control.datapoints, which(rRNA < 0.01))
#control.datapoints <- intersect(control.datapoints, which(response == 1)) # keeping only medium responders doesn't preserve enought data

polyic.datapoints <- intersect(which(group=="2"),  which(lane != "12"))
polyic.datapoints <- intersect(polyic.datapoints,  which(rRNA < 0.01))
#polyic.datapoints <- intersect(polyic.datapoints, which(response == 1))

use.cols <- c(control.datapoints, polyic.datapoints)

test.dpc <- dpc[use.cols]
test.sex.by.rna <- sex.by.rna[use.cols]
test.response <- response[use.cols]
test.rRNA <- as.numeric(rRNA)[use.cols]
test.group <- group[use.cols]
test.lane <- as.numeric(lane)[use.cols]
test.data <- exp.data[,use.cols]

min.cpm.criteria <- 0.1
min.cpm <- min.cpm.criteria

#qCML expression modeling
y <- DGEList(counts=test.data, group=test.group)
keep <- rowSums(cpm(y)>min.cpm) >=2
y <- y[keep, , keep.lib.sizes=FALSE]
## Perform simple exact test on group
y <- calcNormFactors(y)
y <- estimateCommonDisp(y)
y <- estimateTagwiseDisp(y)


test.pseudo <- y$pseudo.counts
pca.results <- prcomp(test.pseudo)

#PCA dataframe does not contain all samples as the exp.data, so I have to find which datapoints are control and polyic now
new.control.datapoints <- which(use.cols %in% control.datapoints)
new.polyic.datapoints <- which(use.cols %in% polyic.datapoints)

train.data <- data.frame(PCA.1=pca.results$rotation[new.control.datapoints,1],
                         PCA.2=pca.results$rotation[new.control.datapoints,2],
                         PCA.3=pca.results$rotation[new.control.datapoints,3],
                         PCA.4=pca.results$rotation[new.control.datapoints,4],
                         PCA.5=pca.results$rotation[new.control.datapoints,5],
                         rRNA=as.numeric(rRNA[new.control.datapoints]),
                         sex=sex.by.rna[new.control.datapoints],
                         lane=lane[new.control.datapoints],
                         response=test.response[new.control.datapoints]
)

predict.data <- data.frame(PCA.1=pca.results$rotation[new.polyic.datapoints,1],
                         PCA.2=pca.results$rotation[new.polyic.datapoints,2],
                         PCA.3=pca.results$rotation[new.polyic.datapoints,3],
                         PCA.4=pca.results$rotation[new.polyic.datapoints,4],
                         PCA.5=pca.results$rotation[new.polyic.datapoints,5],
                         rRNA=as.numeric(rRNA[new.polyic.datapoints]),
                         sex=sex.by.rna[new.polyic.datapoints],
                         lane=lane[new.polyic.datapoints],
                         response=test.response[new.polyic.datapoints]
)

lm.model <- lm(as.numeric(as.character(dpc[control.datapoints])) ~
                 PCA.1 + PCA.2 + PCA.3 + PCA.4 + PCA.5 + rRNA + sex + as.numeric(lane) + response
               , data = train.data
)


# Predicts DPC from the linear model built on train data.
predict.saline <- predict(lm.model, newdata = train.data)
predict.polyic <- predict(lm.model, newdata = predict.data)

# I had some trouble with the original for loop and I replaced it with lapply. I also added the sd values and a plot representing modeled DPC values.
dpc.predict.mean.saline <- lapply(1:4, function(x) FUN=mean(predict.saline[which(as.character(dpc[control.datapoints])==unique(dpc)[x])]))

dpc.predict.sd.saline <- lapply(1:4, function(x) FUN=sd(predict.saline[which(as.character(dpc[control.datapoints])==unique(dpc)[x])]))

dpc.predict.mean.polyic <- lapply(1:4, function(x) FUN=mean(predict.polyic[which(as.character(dpc[polyic.datapoints])==unique(dpc)[x])]))

dpc.predict.sd.polyic <- lapply(1:4, function(x) FUN=sd(predict.polyic[which(as.character(dpc[polyic.datapoints])==unique(dpc)[x])]))

dpc.predict.mean.saline <- as.numeric(dpc.predict.mean.saline)
dpc.predict.sd.saline <- as.numeric(dpc.predict.sd.saline)
dpc.predict.mean.polyic <- as.numeric(dpc.predict.mean.polyic)
dpc.predict.sd.polyic <- as.numeric(dpc.predict.sd.polyic)


lm.predicted.dpc <- data.frame(dpc.predict.mean.saline, dpc.predict.sd.saline, dpc.predict.mean.polyic, dpc.predict.sd.polyic)


####
df_1 <- data.frame("Actual_DPC" = as.numeric(as.character(dpc[control.datapoints])),
                   "Predicted_DPC" = predict.saline, "Condition" = rep("Saline", length(predict.saline)))


df_2 <- data.frame("Actual_DPC" = as.numeric(as.character(dpc[polyic.datapoints])),
                   "Predicted_DPC" = predict.polyic, "Condition" = rep("Poly(I:C)", length(predict.polyic)))

df_combined <- rbind(df_1, df_2)


mean_data <- group_by(df_combined, Actual_DPC, Condition) %>%
             summarise(average_predicted_dpc = mean(Predicted_DPC, na.rm = TRUE),
                       sd_predicted_dpc = sd(Predicted_DPC, na.rm = TRUE)
                       )

mean_data <- as.data.frame(mean_data)

j_brew_colors <- brewer.pal(n = 8, name = "Paired")[c(6,2)]


df_combined$Condition <- ifelse(df_combined$Condition == "Saline", "Control", as.character(df_combined$Condition))
mean_data$Condition <- ifelse(mean_data$Condition == "Saline", "Control", as.character(mean_data$Condition))


Fig_4c <- ggplot()+
  geom_jitter(data=df_combined, aes(x = Actual_DPC, y = Predicted_DPC,  color=Condition), size = 2, height= 0.3, alpha=0.7)+
  geom_line(data=mean_data, aes(x = Actual_DPC, y = average_predicted_dpc, color=Condition), size=1.5, alpha=0.7)+
  theme_bw()+
  scale_color_manual(values=c("Control"=j_brew_colors[2], "Poly(I:C)" = j_brew_colors[1]))+
  scale_x_continuous(breaks= c(12.5, 14.5, 17.5, 19.5), labels=c("E12.5", "E14.5", "E17.5", "P0"))+
  scale_y_continuous(breaks= c(12.5, 14.5, 17.5, 19.5), labels=c("E12.5", "E14.5", "E17.5", "P0"))+
  labs(y = "Predicted DPC", x = "Actual DPC")+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

Fig_4c

19.5 was used as a numerical representation of the P0 time point.

#Nicer dataframe
predicted.dpc.df <- data.frame("Real_DPC"=c(12.5, 14.5, 17.5, 19.5), "Predicted_Saline_Mean_DPC"= dpc.predict.mean.saline, "Predicted_Saline_SD_DPC"=dpc.predict.sd.saline, "Predicted_PolyIC_Mean_DPC"= dpc.predict.mean.polyic, "Predicted_PolyIC_SD_DPC"=dpc.predict.sd.polyic)

##### T test ####
t.test.12 <- t.test(predict.saline[which(group[control.datapoints]==1 & as.character(dpc[control.datapoints])=="12.5")], predict.polyic[which(group[polyic.datapoints]==2 & as.character(dpc[polyic.datapoints])=="12.5")])

t.test.14 <- t.test(predict.saline[which(group[control.datapoints]==1 & as.character(dpc[control.datapoints])=="14.5")], predict.polyic[which(group[polyic.datapoints]==2 & as.character(dpc[polyic.datapoints])=="14.5")])

t.test.17 <- t.test(predict.saline[which(group[control.datapoints]==1 & as.character(dpc[control.datapoints])=="17.5")], predict.polyic[which(group[polyic.datapoints]==2 & as.character(dpc[polyic.datapoints])=="17.5")])

t.test.19 <- t.test(predict.saline[which(group[control.datapoints]==1 & as.character(dpc[control.datapoints])=="19.5")], predict.polyic[which(group[polyic.datapoints]==2 & as.character(dpc[polyic.datapoints])=="19.5")])

t.test.modeled.DPC.p.val <- data.frame("p-value E12.5"=t.test.12$p.value, "p-value E14.5"=t.test.14$p.value, "p-value E17.5"=t.test.17$p.value, "p-value E19.5"=t.test.19$p.value)

predicted_DPC_df <- melt(data.frame("mean E12.5"=t.test.12$estimate, "mean E14.5"=t.test.14$estimate, "mean E17.5"=t.test.17$estimate, "mean E19.5"=t.test.19$estimate))

t_test_p_value=c(t.test.12$p.value, t.test.14$p.value, t.test.17$p.value, t.test.19$p.value)
specify_decimal <- function(x, k) trimws(format(round(x, k), nsmall=k))

predicted_DPC_df_nice <- data.frame("Real_DPC"= c(12.5, 14.5, 17.5, 19.5), "Predicted_DPC_Saline"=predicted_DPC_df$value[c(1,3,5,7)], "Predicted_DPC_PolyIC"=predicted_DPC_df$value[c(2,4,6,8)], "t_test_p_value"=specify_decimal(t_test_p_value, 8))

knitr::kable(predicted_DPC_df_nice)
Real_DPC Predicted_DPC_Saline Predicted_DPC_PolyIC t_test_p_value
12.5 12.78274 12.46882 0.46920042
14.5 14.79024 15.57854 0.07329140
17.5 16.72892 18.72772 0.00000456
19.5 19.38742 19.26204 0.65641544

3. Differential expression

single_timepoint_glm_function<- function(x){

control.datapoints <- intersect(which(group=="1"),  which(lane != "12"))
control.datapoints <- intersect(control.datapoints, which(rRNA < 0.01))
control.datapoints <- intersect(control.datapoints, which(dpc == x))

polyic.datapoints <- intersect(which(group=="2"),  which(lane != "12"))
polyic.datapoints <- intersect(polyic.datapoints,  which(rRNA < 0.01))
polyic.datapoints <- intersect(polyic.datapoints, which(dpc == x))

use.cols <- c(control.datapoints, polyic.datapoints)

test.dpc <- dpc[use.cols]
test.sex.by.rna <- sex.by.rna[use.cols]
test.response <- response[use.cols]
test.rRNA <- as.numeric(rRNA)[use.cols]
test.group <- group[use.cols]
test.lane <- as.numeric(lane)[use.cols]
test.data <- exp.data[,use.cols]

min.cpm.criteria <- 0.1
min.cpm <- min.cpm.criteria

design <- model.matrix(~as.factor(test.lane) + as.factor(test.sex.by.rna) + as.numeric(test.group))
y <- DGEList(counts=test.data, group=group[use.cols])
keep <- rowSums(cpm(y)>min.cpm) >=2 #keeps only genes expressed in above min.cpm in at least 2 libraries in each group 
y <- y[keep, , keep.lib.sizes=FALSE]
y <- estimateGLMCommonDisp(y,design)
y <- estimateGLMTrendedDisp(y,design)
y <- estimateGLMTagwiseDisp(y,design)
fit <- glmFit(y,design) 
lrt <- glmLRT(fit) # Genewise Negative Binomial Generalized Linear Models.

glm.output <- topTags(lrt, n=Inf)
glm.output.full <- glm.output$table

glm.output.full$gene_name <- rownames(glm.output.full)
rownames(glm.output.full) <- NULL
glm.output.full <- glm.output.full[,c(6,1:5)]

glm.output.full

}

E12.5_GLM <- single_timepoint_glm_function(12.5)
E14.5_GLM <- single_timepoint_glm_function(14.5)
E17.5_GLM <- single_timepoint_glm_function(17.5)
E19.5_GLM <- single_timepoint_glm_function(19.5)

3.1 Numbers of DE genes

E12.5_up_n <- nrow(filter(E12.5_GLM, PValue < 0.05, logFC > 0))
E12.5_down_n <- nrow(filter(E12.5_GLM, PValue < 0.05, logFC < 0))

E14.5_up_n <- nrow(filter(E14.5_GLM, PValue < 0.05, logFC > 0))
E14.5_down_n <- nrow(filter(E14.5_GLM, PValue < 0.05, logFC < 0))

E17.5_up_n <- nrow(filter(E17.5_GLM, PValue < 0.05, logFC > 0))
E17.5_down_n <- nrow(filter(E17.5_GLM, PValue < 0.05, logFC < 0))

E19.5_up_n <- nrow(filter(E19.5_GLM, PValue < 0.05, logFC > 0))
E19.5_down_n <- nrow(filter(E19.5_GLM, PValue < 0.05, logFC < 0))

###

E12.5_up_n_FDR <- nrow(filter(E12.5_GLM, FDR < 0.05, logFC > 0))
E12.5_down_n_FDR <- nrow(filter(E12.5_GLM, FDR < 0.05, logFC < 0))

E14.5_up_n_FDR <- nrow(filter(E14.5_GLM, FDR < 0.05, logFC > 0))
E14.5_down_n_FDR <- nrow(filter(E14.5_GLM, FDR < 0.05, logFC < 0))

E17.5_up_n_FDR <- nrow(filter(E17.5_GLM, FDR < 0.05, logFC > 0))
E17.5_down_n_FDR <- nrow(filter(E17.5_GLM, FDR < 0.05, logFC < 0))

E19.5_up_n_FDR <- nrow(filter(E19.5_GLM, FDR < 0.05, logFC > 0))
E19.5_down_n_FDR <- nrow(filter(E19.5_GLM, FDR < 0.05, logFC < 0))


DE_df_n <- t(data.frame("E12.5" =c(E12.5_up_n, E12.5_down_n, E12.5_up_n_FDR, E12.5_down_n_FDR),
           "E14.5" =c(E14.5_up_n, E14.5_down_n, E14.5_up_n_FDR, E14.5_down_n_FDR),
           "E17.5" =c(E17.5_up_n, E17.5_down_n, E17.5_up_n_FDR, E17.5_down_n_FDR),
           "P0" =c(E19.5_up_n, E19.5_down_n, E19.5_up_n_FDR, E19.5_down_n_FDR)))

colnames(DE_df_n) <- c("Upregulated", "Downregulated", "Upregulated", "Downregulated")


DE_df_n_melted <- melt(DE_df_n)

DE_df_n_melted$stat <- c(rep(", P < 0.05", 8), rep(", FDR < 0.05", 8))

DE_df_n_melted$Var2_stat <- paste(DE_df_n_melted$Var2, DE_df_n_melted$stat)


Fig2c <- ggplot(DE_df_n_melted, aes(fill=Var2_stat, group=Var2, x=Var1, y=value))+
  geom_bar(position = "dodge",  stat="identity", color="black")+
   theme_bw()+
  #scale_fill_manual(values=brewer.pal(n = 8, name = "Paired")[c(6,6,2,2 )])+
  scale_fill_manual(values=c("#1F78B4", "#62a0ca", "#E31A1C", "#eb5e60"))+
  theme(legend.title=element_blank())+
  labs(title="Number of DE genes with P < 0.05 and FDR < 0.05", x="", y="")+
  theme(plot.title = element_text(size = rel(1.2), hjust=0.5))+
  geom_text(aes(label=value), position=position_dodge(width=0.9), hjust=-0.2)+
  coord_flip()+
  scale_x_discrete(limits = rev(levels(DE_df_n_melted$Var1)))+
  theme(panel.border = element_blank(),
        #legend.key = element_blank(),
        axis.ticks = element_blank(),
        axis.text  = element_blank(),
        axis.text.y = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())+
  theme(legend.position="bottom")

Fig2c

volcano_plot_text_2 <- function(x, title) {

#This is nicely dealing with datasets with missing cathegories.
test <- ifelse(x$PValue, "Non significant")
test <- ifelse(x$logFC > 0 & x$PValue <0.05, "PValue < 0.05 & logFC > 0", test)
test <- ifelse(x$logFC < 0 & x$PValue <0.05, "PValue < 0.05 & logFC < 0", test)
test <- ifelse(x$logFC > 0 & x$FDR <0.05, "FDR < 0.05 & logFC > 0", test)
test <- ifelse(x$logFC < 0 & x$FDR <0.05, "FDR < 0.05 & logFC < 0", test)

plotDat <- data.frame(x, Group=test)

plotDat$logFC <- ifelse(plotDat$logFC > 4, 4, plotDat$logFC)
plotDat$logFC <- ifelse(plotDat$logFC < -4, -4, plotDat$logFC)


p <- ggplot(plotDat, aes(x = logFC, y=-log10(PValue), fill=Group, col = Group)) +
  geom_point(aes(text=gene_name), size=0.5, pch=21, alpha=0.7, stroke = 0.5)+
  theme_light()+
 scale_fill_manual(values=c("Non significant"="grey30", 
                             "PValue < 0.05 & logFC > 0"="#eb5e60", 
                             "PValue < 0.05 & logFC < 0"="#62a0ca", 
                             "FDR < 0.05 & logFC > 0" = "#960304", 
                             "FDR < 0.05 & logFC < 0" = "#01538a"))+
   scale_color_manual(values=c("Non significant"="grey30", 
                             "PValue < 0.05 & logFC > 0"="#eb5e60", 
                             "PValue < 0.05 & logFC < 0"="#62a0ca", 
                             "FDR < 0.05 & logFC > 0" = "#960304", 
                             "FDR < 0.05 & logFC < 0" = "#01538a"))+
  labs(title= title, y="", x="")+
  theme(plot.title = element_text(size = rel(2), hjust=0.5))+
  theme(legend.text=element_text(size=16))+
  theme(axis.text=element_text(size=14))+
  theme(legend.title=element_blank())+
  theme(axis.text=element_text(size=14))+
  theme(axis.title = element_text(size=14, face = "bold"))+
  theme(legend.position="bottom")+
  coord_cartesian(xlim = c(-4.2, 4.2))+
  geom_hline(yintercept = -log10(0.05), linetype=2)+
   scale_x_continuous(breaks=c(-4, -2, 0, 2, 4), labels=c("<-4","-2", "0", "2", ">4"))+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

ggplotly(p) %>%
  layout(legend = list(
      orientation = "v",
      y = -0.1, 
      font=list(
            size=14
        )
    )
  )
}


#################  Version capped at different y axis levels ###############

volcano_plot_text_3 <- function(x, title) {

#This is nicely dealing with datasets with missing cathegories.
test <- ifelse(x$PValue, "Non significant")
test <- ifelse(x$logFC > 0 & x$PValue <0.05, "PValue < 0.05 & logFC > 0", test)
test <- ifelse(x$logFC < 0 & x$PValue <0.05, "PValue < 0.05 & logFC < 0", test)
test <- ifelse(x$logFC > 0 & x$FDR <0.05, "FDR < 0.05 & logFC > 0", test)
test <- ifelse(x$logFC < 0 & x$FDR <0.05, "FDR < 0.05 & logFC < 0", test)


plotDat <- data.frame(x, Group=test)

plotDat$logFC <- ifelse(plotDat$logFC > 4, 4, plotDat$logFC)
plotDat$logFC <- ifelse(plotDat$logFC < -4, -4, plotDat$logFC)

plotDat$log10_PValue <- -log10(plotDat$PValue)

plotDat$log10_PValue <- ifelse(plotDat$log10_PValue > 8, 8, plotDat$log10_PValue)

#scale_fill_manual(values=c("#1F78B4" - blue, "#62a0ca" - lighter blue, "#E31A1C" - red, "#eb5e60" - lighter red))+

p <- ggplot(plotDat, aes(x = logFC, y=log10_PValue, fill=Group, col = Group)) +
  geom_point(aes(text=gene_name), size=0.5, pch=21, alpha=0.7, stroke = 0.5)+
  theme_light()+
  scale_fill_manual(values=c("Non significant"="grey30", 
                             "PValue < 0.05 & logFC > 0"="#eb5e60", 
                             "PValue < 0.05 & logFC < 0"="#62a0ca", 
                             "FDR < 0.05 & logFC > 0" = "#960304", 
                             "FDR < 0.05 & logFC < 0" = "#01538a"))+
   scale_color_manual(values=c("Non significant"="grey30", 
                             "PValue < 0.05 & logFC > 0"="#eb5e60", 
                             "PValue < 0.05 & logFC < 0"="#62a0ca", 
                             "FDR < 0.05 & logFC > 0" = "#960304", 
                             "FDR < 0.05 & logFC < 0" = "#01538a"))+
  labs(title= title, y="", x="")+
  theme(plot.title = element_text(size = rel(2), hjust=0.5))+
  theme(legend.text=element_text(size=16))+
  theme(axis.text=element_text(size=14))+
  theme(legend.title=element_blank())+
  theme(axis.text=element_text(size=14))+
  theme(axis.title = element_text(size=14, face = "bold"))+
  theme(legend.position="bottom")+
  coord_cartesian(xlim = c(-4.2, 4.2))+
  expand_limits(x = c(-4.2, 4.2))+
  coord_cartesian(ylim = c(0, 8))+
  geom_hline(yintercept = -log10(0.05), linetype=2)+
  scale_x_continuous(breaks=c(-4, -2, 0, 2, 4), labels=c("<-4","-2", "0", "2", ">4"))+
  scale_y_continuous(breaks=c(0, 2, 4, 6, 8), labels=c("0","2", "4", "6", ">8"))+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

ggplotly(p) %>%
  layout(legend = list(
      orientation = "v",
      y = -0.1, 
      font=list(
            size=14
        )
    )
  )
}

3.2 Volcano plots and tables

3.2.1 E12.5

p <- volcano_plot_text_3(filter(E12.5_GLM, logFC > 1 | logFC < -1 | PValue < 0.05), "E12.5 GLM DE analysis")
p
datatable(E12.5_GLM, rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))

3.2.2 E14.5

p <- volcano_plot_text_3(filter(E14.5_GLM, logFC > 1 | logFC < -1 | PValue < 0.05), "E14.5 GLM DE analysis")
p
datatable(E14.5_GLM, rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))

3.2.3 E17.5

p <- volcano_plot_text_2(filter(E17.5_GLM, logFC > 1 | logFC < -1 | PValue < 0.05), "E17.5 GLM DE analysis")
p
datatable(E17.5_GLM, rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))

3.2.4 P0

p <- volcano_plot_text_3(filter(E19.5_GLM, logFC > 1 | logFC < -1 | PValue < 0.05), "P0 GLM DE analysis")
p
datatable(E19.5_GLM, rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))